"""
本案例实现多进程爬虫
"""

# 导入Request
import requests
# 导入bs4和 lxml
from bs4 import BeautifulSoup
import lxml
# 导入多线程模块
import threading
# 导入队列
from queue import Queue
# 导入os
import os
# 导入下载图片模块
from urllib import request
# 导入正则表达式
import re
# 导入pymysql
import pymysql
# 导入日期模块
from datetime import date


class WeatherSpider:
    """Multi-threaded spider that scrapes 15-day weather forecasts from
    tianqi.com and stores them in a MySQL database.

    Pipeline (each stage runs in its own daemon thread, connected by
    queues): url_queue -> html_queue -> content_queue -> MySQL.
    """

    def __init__(self):
        # Page listing every Chinese region, and the per-region
        # 15-day-forecast URL template.
        self.all_city_url = "https://www.tianqi.com/chinacity.html"
        self.base_city_url = "https://www.tianqi.com/{}/15"

        # NOTE(review): never read anywhere in this class — confirm before removing.
        self.pages_number = 24

        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; " + \
                                      "Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + \
                                      "Chrome/69.0.3497.100 Safari/537.36"}
        # Database connection settings.
        self.db_conn = {
            'NAME': 'DB06',
            'USER': 'root',
            'PASSWORD': '1234.Com',
            'HOST': '192.168.182.5',
        }
        # Region records: dicts with 'province'/'city'/'district'/'url' keys.
        self.city_list = []
        # Pipeline queues.
        self.url_queue = Queue()      # forecast-page URLs to fetch
        self.html_queue = Queue()     # raw HTML of fetched pages
        self.content_queue = Queue()  # parsed weather data

        # Populate self.city_list immediately (performs one HTTP request).
        self.get_city_url()

    @staticmethod
    def _url_slug(anchor):
        """Return an anchor's href with the leading and trailing slash
        removed, e.g. '/beijing/' -> 'beijing'."""
        href = str(anchor.attrs['href'])
        return href[1:-1]

    def get_city_url(self):
        """Fetch the national city index and fill ``self.city_list`` with
        one record per district."""
        city_html_str = requests.get(self.all_city_url, headers=self.headers).content.decode("utf-8")
        soup = BeautifulSoup(city_html_str, 'lxml')
        # The whole index lives inside <div class="citybox">; each <h2> is a
        # province and the matching <span> holds its cities/districts.
        city_box = soup.find('div', class_='citybox')
        province_html = city_box.find_all('h2')
        city_html = city_box.find_all('span')

        for i, province_tag in enumerate(province_html):
            province_anchor = province_tag.find('a')
            current_province = province_anchor.text

            # Municipalities (Beijing/Shanghai/Tianjin/Chongqing) are both
            # province and city; their own anchor is also a forecast page,
            # so include it ahead of the district anchors.
            if current_province in ['北京', '上海', '天津', '重庆']:
                district_anchors = city_html[i].find_all('a')
                district_anchors.insert(0, province_anchor)
                for anchor in district_anchors:
                    self.city_list.append({
                        'province': current_province,
                        'city': current_province,
                        'district': anchor.text,
                        'url': self._url_slug(anchor),
                    })
            else:
                # Regular province: anchors wrapped in <h3> mark
                # prefecture-level cities; every following anchor belongs
                # to that city until the next <h3>.
                city_h3_list = city_html[i].find_all('h3')
                current_city = ""
                for anchor in city_html[i].find_all('a'):
                    if str(anchor) in str(city_h3_list):
                        current_city = anchor.text
                    self.city_list.append({
                        'province': current_province,
                        'city': current_city,
                        'district': anchor.text,
                        'url': self._url_slug(anchor),
                    })

    def save_url_to_db(self):
        """Write every record in ``self.city_list`` into the ``citys`` table.

        Uses a parameterized query (the original interpolated values into
        the SQL string, which is injection-prone and breaks on quotes).
        """
        # PyMySQL >= 1.0 requires keyword arguments for connect().
        mysql_conn = pymysql.connect(host=self.db_conn['HOST'], user=self.db_conn['USER'],
                                     password=self.db_conn['PASSWORD'], database=self.db_conn['NAME'])
        cursor = mysql_conn.cursor()
        sql = "insert into citys (province, city, district, url) values (%s, %s, %s, %s)"
        try:
            for one in self.city_list:
                cursor.execute(sql, (one['province'], one['city'], one['district'], one['url']))
                mysql_conn.commit()
        except Exception as e:
            mysql_conn.rollback()
            print("写入数据库出现异常！具体原因：" + str(e))
        finally:
            mysql_conn.close()

    def get_url_list(self):
        """Enqueue the 15-day-forecast URL for every known region."""
        for city in self.city_list:
            self.url_queue.put(self.base_city_url.format(city['url']))

    def parse_url(self):
        """Worker loop: fetch each URL from ``url_queue`` and push the
        decoded HTML onto ``html_queue``.

        Runs forever; intended to be started as a daemon thread.
        ``task_done()`` is guaranteed via ``finally`` so a failed request
        cannot deadlock ``url_queue.join()``.
        """
        while True:
            url = self.url_queue.get()
            try:
                # Decode once and reuse (the original decoded the body twice).
                response_text = requests.get(url, headers=self.headers).content.decode("utf-8")
                # Only forward non-empty pages.
                if response_text.strip():
                    self.html_queue.put(response_text)
            except Exception as e:
                # Best-effort: log and keep the worker alive.
                print("Failed to fetch " + url + ": " + str(e))
            finally:
                self.url_queue.task_done()

    def get_content_list(self):
        """Worker loop: parse each HTML page from ``html_queue`` into a
        ``{city_url: {date: weather_dict}}`` mapping and push it onto
        ``content_queue``.

        Pages whose layout does not match are skipped, but the reason is
        logged instead of being silently swallowed (the original used a
        bare ``except: pass``).
        """
        while True:
            html_str = self.html_queue.get()
            try:
                soup = BeautifulSoup(html_str, 'lxml')
                # One <li> per forecast day inside <ul class="weaul">.
                day_items = soup.find('ul', class_='weaul').find_all('li')
                days_weathers = {}
                for day in day_items:
                    weather_str = day.find('div', class_='weaul_z').text
                    air_str = day.find('div', class_='weaul_w').text
                    wind_str = day.find('div', class_='weaul_s').text
                    temps = re.findall(r'\d+', weather_str)
                    one_day = {
                        # First run of CJK characters is the weather label.
                        'weather': re.findall(r'[\u4E00-\u9FA5]+', weather_str)[0],
                        'min_temp': int(temps[0]),
                        'max_temp': int(temps[1]),
                        'air': air_str.split(" ")[1],
                        'wind': wind_str.split(" ")[0],
                        'wind_number': int(re.findall(r'\d+', wind_str.split(" ")[1])[0]),
                    }
                    # The page shows only month/day ('MM月DD…'); assume the
                    # current year (the original hard-coded 2020).
                    # NOTE(review): wrong across a Dec->Jan year boundary.
                    date_str = day.find('div', class_='weaul_q').text[0:6]
                    forecast_date = date(date.today().year, int(date_str[0:2]), int(date_str[3:5]))
                    days_weathers[forecast_date] = one_day

                # Derive the city slug from the first day's link href.
                city_name_str = str(day_items[0].find('a').attrs['href'])
                city_url = city_name_str[1:city_name_str.find('?') - 1]
                self.content_queue.put({city_url: days_weathers})
            except Exception as e:
                print("Failed to parse page: " + str(e))
            finally:
                self.html_queue.task_done()

    def save_content_list(self):
        """Worker loop: write each parsed result from ``content_queue``
        into the ``weather`` table.

        Runs forever; intended to be started as a daemon thread. Uses a
        parameterized query and guards DB errors so one bad record cannot
        kill the thread and deadlock ``content_queue.join()``.
        """
        mysql_conn = pymysql.connect(host=self.db_conn['HOST'], user=self.db_conn['USER'],
                                     password=self.db_conn['PASSWORD'], database=self.db_conn['NAME'])
        cursor = mysql_conn.cursor()
        sql = ("insert into weather (url,date,weather,low,high,air,wind,wind_scale) "
               "values (%s,%s,%s,%s,%s,%s,%s,%s)")

        while True:
            content_list = self.content_queue.get()
            try:
                # Outer key is the city slug; inner keys are forecast dates.
                city_url = list(content_list.keys())[0]
                for day, info in content_list[city_url].items():
                    cursor.execute(sql, (city_url, day, info['weather'],
                                         info['min_temp'], info['max_temp'],
                                         info['air'], info['wind'],
                                         info['wind_number']))
                    mysql_conn.commit()
            except Exception as e:
                mysql_conn.rollback()
                print("Failed to save weather data: " + str(e))
            finally:
                self.content_queue.task_done()

    def run(self):
        """Start the full pipeline and block until every queue drains."""
        workers = [
            threading.Thread(target=self.get_url_list),       # 1. fill URL queue
            threading.Thread(target=self.parse_url),          # 2. fetch pages
            threading.Thread(target=self.get_content_list),   # 3. extract data
            threading.Thread(target=self.save_content_list),  # 4. persist
        ]

        for t in workers:
            # Daemon threads die with the main thread; the queue joins
            # below define the real lifetime of the pipeline.
            # (t.setDaemon(True) is deprecated in favor of the attribute.)
            t.daemon = True
            t.start()

        # Wait until every queued item has been marked done at each stage.
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()

        print("主线程结束！")


if __name__ == '__main__':
    import time

    started_at = time.time()

    spider = WeatherSpider()
    # Persist the nationwide region list to the database (one-off step).
    # spider.save_url_to_db()
    spider.run()

    elapsed = time.time() - started_at
    print("程序执行时长：%.5f" % elapsed)
